#### setting environment ####
require(quanteda)
require(stm)

#### data preparation ####
# document-feature matrix of newspaper editorials published from October 1, 2017 to September 30, 2018
load("Study2_dfm.Rdata")
# convert the document-feature matrix to stm format
STM.data <- convert(Study2.dfm, to = "stm")

## number of editorials (Table 1)
table(docvars(Study2.dfm, "newspaper"))

#### estimate the structural topic models and choosing the number of topics (Figures A.1 and A.2) ####
## estimating the models varying the number of topics from 20 to 160
n.topics.1 <- seq(20, 160, 10)

# estimation
for (i in 1:length(n.topics.1)) {
  set.seed(n.topics.1[i])
  STM.result <- stm(documents = STM.data$documents, 
                    vocab = STM.data$vocab, 
                    K = n.topics.1[i], 
                    prevalence = ~ s(numeric.date), data = STM.data$meta)
  save(STM.result, file = paste0("STM_result_", n.topics.1[i], ".Rdata"))
  print(paste0("Estimation of a model in which the number of topic is ", n.topics.1[i], " is completed at ", date(), "."))
}

# compute exclusivity and semantic coherence
exclusivity.evaluation.1 <- semantic.coherence.evaluation.1 <- rep(NA, length(n.topics.1))
for (i in 1:length(n.topics.1)) {
  load(paste0("STM_result_", n.topics.1[i], ".Rdata"))
  exclusivity.evaluation.1[i] <- mean(exclusivity(STM.result))
  semantic.coherence.evaluation.1[i] <- mean(semanticCoherence(STM.result, STM.data$documents))
}

# draw the figure
cairo_pdf("Figure_A1.pdf",
          width = 5.6, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2))
plot(c(1:length(n.topics.1)), exclusivity.evaluation.1, type = "b", 
     main = "Exclusivity", xlab = "Number of Topics", ylab = "Average Exclusivity", xaxt = "n")
axis(1, at = seq(1, length(n.topics.1), 2), 
     labels = n.topics.1[seq(1, length(n.topics.1), 2)], cex.axis = 0.9)
plot(c(1:length(n.topics.1)), semantic.coherence.evaluation.1, type = "b", 
     main = "Semantic Coherence", xlab = "Number of Topics", ylab = "Average Semantic Coherence", xaxt = "n")
axis(1, at = seq(1, length(n.topics.1), 2), 
     labels = n.topics.1[seq(1, length(n.topics.1), 2)], cex.axis = 0.9)
dev.off()

## estimate the models varying the number of topics from 60 to 80
n.topics.2 <- 60:80

# estimation
for (i in 2:(length(n.topics.2) - 1)) {
  set.seed(n.topics.2[i])
  STM.result <- stm(documents = STM.data$documents, 
                    vocab = STM.data$vocab, 
                    K = n.topics.2[i], 
                    prevalence = ~ s(numeric.date), data = STM.data$meta)
  save(STM.result, file = paste0("STM_result_", n.topics.2[i], ".Rdata"))
  print(paste0("Estimation of a model in which the number of topic is ", n.topics.2[i], " is completed at ", date(), "."))
}

# compute exclusivity and semantic coherence
exclusivity.evaluation.2 <- semantic.coherence.evaluation.2 <- rep(NA, length(n.topics.2))
for (i in 1:length(n.topics.2)) {
  load(paste0("STM_result_", n.topics.2[i], ".Rdata"))
  exclusivity.evaluation.2[i] <- mean(exclusivity(STM.result))
  semantic.coherence.evaluation.2[i] <- mean(semanticCoherence(STM.result, STM.data$documents))
}

# draw the figure
cairo_pdf("Figure_A2.pdf",
          width = 5.6, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2))
plot(c(1:length(n.topics.2)), exclusivity.evaluation.2, type = "b", 
     main = "Exclusivity", xlab = "Number of Topics", ylab = "Average Exclusivity", xaxt = "n")
axis(1, at = seq(1, length(n.topics.2), 2), 
     labels = n.topics.2[seq(1, length(n.topics.2), 2)], cex.axis = 0.9)
plot(c(1:length(n.topics.2)), semantic.coherence.evaluation.2, type = "b", 
     main = "Semantic Coherence", xlab = "Number of Topics", ylab = "Average Semantic Coherence", xaxt = "n")
axis(1, at = seq(1, length(n.topics.2), 2), 
     labels = n.topics.2[seq(1, length(n.topics.2), 2)], cex.axis = 0.9)
dev.off()